In [1]:
from bigbang.archive import Archive
import bigbang.parse as parse
import bigbang.graph as graph
import bigbang.mailman as mailman
import bigbang.process as process
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint as pp
import pytz
import numpy as np
import math
import nltk
from itertools import repeat
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from nltk.corpus import stopwords
import re
In [2]:
urls = ["http://mail.scipy.org/pipermail/ipython-dev/",
"http://mail.scipy.org/pipermail/ipython-user/"]#,
#"http://mail.scipy.org/pipermail/scipy-dev/",
#"http://mail.scipy.org/pipermail/scipy-user/",
#"http://mail.scipy.org/pipermail/numpy-discussion/"]
archives = [Archive(url, archive_dir="../archives") for url in urls]
In [3]:
act = archives[0].get_activity()
act1 = archives[1].get_activity()
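If the activity frame has dates as rows and sender addresses as columns (as BigBang's get_activity() builds it), a quick sanity check is possible; a minimal sketch, not part of the original analysis:
In [ ]:
# Assumed shape: (number of days, number of distinct senders)
print act.shape
# Total messages per day, summed across senders
act.sum(1).plot()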
In [4]:
fig = plt.figure(figsize=(12.5, 7.5))
#act.idxmax().order().T.plot()
(act > 0).idxmax().order().plot()
fig.axes[0].yaxis_date()
In [5]:
timeorder = (act > 0).idxmax().order()
timeorder1 = (act1 > 0).idxmax().order()
In [6]:
archives[1].data[:2]
Out[6]:
In [7]:
for row in archives[0].data[:2].iterrows():
    print row[1]["Body"]
In [8]:
arx = archives[0]
In [9]:
k = pd.DataFrame()  # scratch cell; k is reassigned in the wordcount loop below
In [10]:
first_participation = {}
# assumes rows are in chronological order, so the first row seen per sender is their earliest post
for row in archives[0].data.iterrows():
    if row[1]["From"] not in first_participation:
        first_participation[row[1]["From"]] = row[1]["Date"]
In [11]:
first_participation1 = {}
for row in archives[1].data.iterrows():
    if row[1]["From"] not in first_participation1:
        first_participation1[row[1]["From"]] = row[1]["Date"]
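The same first-participation lookup can be done in one pass with pandas; a minimal sketch, assuming the Date column is directly comparable (and taking the true minimum rather than relying on row order):
In [ ]:
# Earliest Date per sender, as a dict keyed by the From field
first_participation_alt = archives[0].data.groupby("From")["Date"].min().to_dict()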
In [67]:
#First list
stop_words = set(stopwords.words('english'))  # cache as a set: one lookup per token instead of a list scan
wordcount = {}
for row in archives[0].data.iterrows():
    w = row[1]["Body"].replace("'", "")
    k = re.sub(r'[^\w]', ' ', w)
    t = nltk.tokenize.word_tokenize(k)
    for g in t:
        try:
            word = st.stem(g)
        except:
            print g
            continue  # skip tokens the stemmer rejects instead of reusing the previous stem
        if word in stop_words:
            continue
        if word not in wordcount:
            # first appearance: [count, Message-ID, Date, From, In-Reply-To]
            wordcount[word] = [1]
            wordcount[word].append(row[0])
            wordcount[word].append(row[1]["Date"])
            wordcount[word].append(row[1]["From"])
            wordcount[word].append(row[1]["In-Reply-To"])
        else:
            wordcount[word][0] += 1
wd = dict(wordcount)  # real copy, so the filtering below leaves wordcount intact
In [13]:
#Second list
wordcount1 = {}
for row in archives[1].data.iterrows():
    w = row[1]["Body"].replace("'", "")
    k = re.sub(r'[^\w]', ' ', w)
    t = nltk.tokenize.word_tokenize(k)
    for g in t:
        try:
            word = st.stem(g)
        except:
            print g
            continue
        if word in stop_words:
            continue
        if word not in wordcount1:
            wordcount1[word] = [1]
            wordcount1[word].append(row[0])
            wordcount1[word].append(row[1]["Date"])
            wordcount1[word].append(row[1]["From"])
            wordcount1[word].append(row[1]["In-Reply-To"])
        else:
            wordcount1[word][0] += 1
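The two cells above differ only in which archive they scan, so they could share one helper; a sketch of that refactor (count_words is a hypothetical name, same logic as above):
In [ ]:
def count_words(data):
    # {stem: [count, Message-ID, Date, From, In-Reply-To]} for one archive
    counts = {}
    for row in data.iterrows():
        body = re.sub(r'[^\w]', ' ', row[1]["Body"].replace("'", ""))
        for token in nltk.tokenize.word_tokenize(body):
            try:
                word = st.stem(token)
            except:
                continue
            if word in stop_words:
                continue
            if word not in counts:
                counts[word] = [1, row[0], row[1]["Date"], row[1]["From"], row[1]["In-Reply-To"]]
            else:
                counts[word][0] += 1
    return counts

#wordcount = count_words(archives[0].data); wordcount1 = count_words(archives[1].data)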
In [14]:
#new_df = pd.DataFrame(wordcount.items(),columns=["Word","Others"])
In [15]:
#pd.concat(pd.Series(wordcount.keys()),pd.DataFrame(wordcount.values(),columns=["A","B","C","D","E"]))
In [16]:
#Wordcount information dataframe, with rows as words.
new_dataframe = pd.DataFrame(wordcount).transpose()
new_dataframe.columns = ["Wordcount", "Message-ID", "Date", "From", "In-Reply-To"]
In [17]:
#Wordcount information dataframe, with rows as words.
new_dataframe1 = pd.DataFrame(wordcount1).transpose()
new_dataframe1.columns = ["Wordcount", "Message-ID", "Date", "From", "In-Reply-To"]
In [18]:
len(wordcount) #Number of unique words in mailing list1
Out[18]:
In [19]:
len(wordcount1) #Number of unique words in mailing list2
Out[19]:
In [20]:
#Number of unique words common to both mailing lists
samewordcount = 0
for word in wordcount:
    if word in wordcount1:
        samewordcount += 1
samewordcount
Out[20]:
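Because iterating a dict yields its keys, the same count is just a set intersection; an equivalent one-liner:
In [ ]:
# Shared vocabulary via set intersection (equivalent to the loop above)
len(set(wordcount) & set(wordcount1))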
In [21]:
#Number of common words first introduced by the same person in both lists
samecount = 0
for word in wordcount:
    if word in wordcount1:
        if wordcount[word][3] == wordcount1[word][3]:
            samecount += 1
samecount
Out[21]:
In [22]:
#Among words appearing 100-500 times, the number common to both mailing lists
samewordcount = 0
for word in wordcount:
    if 100 <= wordcount[word][0] <= 500:
        if word in wordcount1:
            if 100 <= wordcount1[word][0] <= 500:
                samewordcount += 1
samewordcount
Out[22]:
In [23]:
#Among words appearing 100-500 times, the number common to both mailing lists
#that were first introduced by the same person
same_person_count = 0
for word in wordcount:
    if 100 <= wordcount[word][0] <= 500:
        if word in wordcount1:
            if 100 <= wordcount1[word][0] <= 500:
                if wordcount[word][3] == wordcount1[word][3]:
                    #print word
                    same_person_count += 1
same_person_count
Out[23]:
In [24]:
#Common words (introduced by different people in the two lists)
commonwords = {}
for word in wordcount:
    if 100 <= wordcount[word][0] <= 500:
        if word in wordcount1:
            if 100 <= wordcount1[word][0] <= 500:
                if wordcount[word][3] != wordcount1[word][3]:
                    commonwords[word] = [wordcount[word][0], wordcount[word][3], wordcount[word][2],
                                         wordcount1[word][0], wordcount1[word][3], wordcount1[word][2]]
In [25]:
len(commonwords)
Out[25]:
In [27]:
#Dataframe of the words introduced by different people
df1 = pd.DataFrame(commonwords)
commonword_differentauthor_dataframe = df1.transpose()
commonword_differentauthor_dataframe.columns = ["Wordcount1", "From1", "Date1", "Wordcount2", "From2", "Date2"]
commonword_differentauthor_dataframe[:10]
Out[27]:
In [28]:
commonword_differentauthor_dataframe['Date1'][0] < commonword_differentauthor_dataframe['Date1'][1]
Out[28]:
In [29]:
len(commonwords)
Out[29]:
In [30]:
# Words with potential idea flow. Definition: word A is first introduced by person p in one list;
# person q, who was already active in that list before p introduced A (our proxy for "q saw it"),
# later introduces A in the other list. Collect every such word A.
time_influence = 0
influence_list = {}
for word in commonwords:
    if commonwords[word][2] > commonwords[word][5]:  # the word appeared in list 2 first
        if commonwords[word][1] in first_participation1:  # list-1 introducer also posts in list 2
            # ...and was active in list 2 before the word first appeared there
            if first_participation1[commonwords[word][1]] < commonwords[word][5]:
                influence_list[word] = commonwords[word]
                time_influence += 1
    else:  # the word appeared in list 1 first
        if commonwords[word][4] in first_participation:  # list-2 introducer also posts in list 1
            if first_participation[commonwords[word][4]] < commonwords[word][2]:
                influence_list[word] = commonwords[word]
                time_influence += 1
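To make the temporal condition concrete, here is a toy check with made-up dates (illustrative values only, not data from the archives):
In [ ]:
from datetime import datetime
# The word first appears in list 2, then in list 1:
date2 = datetime(2005, 1, 10)   # first appearance in list 2
date1 = datetime(2005, 3, 2)    # first appearance in list 1
# The list-1 introducer had been posting in list 2 since before the word appeared there:
q_first_in_list2 = datetime(2004, 6, 1)
print date1 > date2 and q_first_in_list2 < date2  # True -> counted as a potential idea flow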
In [31]:
time_influence
Out[31]:
In [32]:
len(influence_list)
Out[32]:
In [34]:
df2 = pd.DataFrame(influence_list)
influence_list_dataframe = df2.transpose()
influence_list_dataframe.columns = ["Wordcount1", "From1", "Date1", "Wordcount2", "From2", "Date2"]
influence_list_dataframe[:20]
Out[34]:
In [35]:
influence_words = influence_list.keys()
In [36]:
#Drop words that contain only digits (they carry little information)
reduced_influence_words = []
for word in influence_words:
    if not word.isdigit():
        reduced_influence_words.append(word)
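The same filter as a one-line comprehension:
In [ ]:
#Equivalent to the loop above
reduced_influence_words = [w for w in influence_words if not w.isdigit()]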
In [37]:
len(reduced_influence_words)
Out[37]:
In [38]:
reduced_influence_words[:20]
Out[38]:
In [39]:
#Store the list
import csv
with open('test123.csv', 'w') as fp:
    a = csv.writer(fp)
    data = [reduced_influence_words]
    a.writerows(data)
#reduced_influence_words.to_csv()
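The commented .to_csv() hint works once the plain list is wrapped in a pandas object; a sketch (the filename is arbitrary):
In [ ]:
#Same export via pandas: one word per row
pd.Series(reduced_influence_words).to_csv('influence_words.csv')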
In [40]:
influence_list_dataframe.keys()
Out[40]:
In [68]:
#Keep only the words appearing 100-500 times (value[0] holds the count)
for key, value in wd.items():
    if value[0] < 100 or value[0] > 500:
        del wd[key]
In [69]:
wc_array = np.array([value[0] for value in wd.values()])  # counts only
In [70]:
wc_array.sort()
In [72]:
len(wordcount)
Out[72]:
In [66]:
#List 1's unique words and their counts: a power-law-like distribution
%matplotlib inline
plt.plot(wc_array)
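A rank-frequency plot on log-log axes is the usual visual check for power-law behavior; a sketch using the same counts:
In [ ]:
#A roughly straight line on log-log axes suggests a power law
ranked = np.sort(wc_array)[::-1]   # counts in descending order
plt.loglog(ranked)
plt.xlabel('rank')
plt.ylabel('count')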
In [12]:
t = nltk.tokenize.word_tokenize(p)  # scratch cell: p is a sample body string defined interactively
In [11]:
len(nltk.corpus.stopwords.words('english'))
Out[11]:
In [20]:
a = []
for i in t:
    a.append(st.stem(i))
In [ ]: